{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3d30cc80",
   "metadata": {},
   "source": [
    "# 为了便于调式各个py文件，建立一个主脚本my_main"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5749ad2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "股票信息提取完成\n",
      "概念信息提取完成\n"
     ]
    },
    {
     "ename": "ConnectTimeout",
     "evalue": "HTTPConnectionPool(host='api.waditu.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fd27a28efd0>, 'Connection to api.waditu.com timed out. (connect timeout=30)'))",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mtimeout\u001b[0m                                   Traceback (most recent call last)",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    169\u001b[0m             conn = connection.create_connection(\n\u001b[0;32m--> 170\u001b[0;31m                 \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dns_host\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mport\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mextra_kw\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    171\u001b[0m             )\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m     95\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m     85\u001b[0m                 \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m             \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     87\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mtimeout\u001b[0m: timed out",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mConnectTimeoutError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m    705\u001b[0m                 \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 706\u001b[0;31m                 \u001b[0mchunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunked\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    707\u001b[0m             )\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m    393\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 394\u001b[0;31m                 \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mhttplib_request_kw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m    233\u001b[0m             \u001b[0mheaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"User-Agent\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_default_user_agent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m         \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    235\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m   1276\u001b[0m         \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1277\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m   1322\u001b[0m             \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'body'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1323\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1324\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m   1271\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1272\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencode_chunked\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencode_chunked\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1273\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m   1031\u001b[0m         \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_buffer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1032\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1033\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m    971\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 972\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    973\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    199\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m         \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    201\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    176\u001b[0m                 \u001b[0;34m\"Connection to %s timed out. (connect timeout=%s)\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 177\u001b[0;31m                 \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    178\u001b[0m             )\n",
      "\u001b[0;31mConnectTimeoutError\u001b[0m: (<urllib3.connection.HTTPConnection object at 0x7fd27a28efd0>, 'Connection to api.waditu.com timed out. (connect timeout=30)')",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mMaxRetryError\u001b[0m                             Traceback (most recent call last)",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    448\u001b[0m                     \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m                     \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    450\u001b[0m                 )\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m    755\u001b[0m             retries = retries.increment(\n\u001b[0;32m--> 756\u001b[0;31m                 \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_pool\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_stacktrace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    757\u001b[0m             )\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m    573\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 574\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mMaxRetryError\u001b[0m: HTTPConnectionPool(host='api.waditu.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fd27a28efd0>, 'Connection to api.waditu.com timed out. (connect timeout=30)'))",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mConnectTimeout\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m~/Documents/mycode/project4/step1_get_data.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     29\u001b[0m     \u001b[0;31m# 获取该概念下的全部股票列表\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m     \u001b[0;31m# https://waditu.com/document/2?doc_id=126\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m     \u001b[0mconcept_stocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpro\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcept_detail\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconcept_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     32\u001b[0m     \u001b[0mconcept_details\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconcept_details\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconcept_stocks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[0mconcept_details\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./data/knowledge/股票-概念信息.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'gbk'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/tushare/pro/client.py\u001b[0m in \u001b[0;36mquery\u001b[0;34m(self, api_name, fields, **kwargs)\u001b[0m\n\u001b[1;32m     38\u001b[0m         }\n\u001b[1;32m     39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__http_url\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreq_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'Connection'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'close'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     41\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     42\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m    117\u001b[0m     \"\"\"\n\u001b[1;32m    118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    121\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m     59\u001b[0m     \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    540\u001b[0m         }\n\u001b[1;32m    541\u001b[0m         \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 542\u001b[0;31m         \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    544\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m    653\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    654\u001b[0m         \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 655\u001b[0;31m         \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    656\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    657\u001b[0m         \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/envs/mypy37/lib/python3.7/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    502\u001b[0m                 \u001b[0;31m# TODO: Remove this in 3.0.0: see #2811\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    503\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreason\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNewConnectionError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 504\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0mConnectTimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    506\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreason\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mConnectTimeout\u001b[0m: HTTPConnectionPool(host='api.waditu.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fd27a28efd0>, 'Connection to api.waditu.com timed out. (connect timeout=30)'))"
     ]
    }
   ],
   "source": [
    "#执行step1获取数据\n",
    "#充值了50元获得500积分，能够下载部分信息，剩余文件使用老师提供的以节省资金\n",
    "%run step1_get_data.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "68c68d8e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Graph('http://localhost:7474')\n",
      "创建 股票 实体...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2523it [01:18, 41.05it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: [Request.InvalidFormat] Could not parse the incoming JSON, data idx: 2517, data: Unnamed: 0         2517\n",
      "ts_code       430057.BJ\n",
      "symbol           430057\n",
      "name           北交所测试代码1\n",
      "industry            NaN\n",
      "Name: 2517, dtype: object\n",
      "Error: [Request.InvalidFormat] Could not parse the incoming JSON, data idx: 2518, data: Unnamed: 0         2518\n",
      "ts_code       430198.BJ\n",
      "symbol           430198\n",
      "name           微创光电(测试)\n",
      "industry            NaN\n",
      "Name: 2518, dtype: object\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "4510it [02:11, 34.29it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: [Request.InvalidFormat] Could not parse the incoming JSON, data idx: 4507, data: Unnamed: 0         4507\n",
      "ts_code       834765.BJ\n",
      "symbol           834765\n",
      "name            美之高(测试)\n",
      "industry            NaN\n",
      "Name: 4507, dtype: object\n",
      "Error: [Request.InvalidFormat] Could not parse the incoming JSON, data idx: 4508, data: Unnamed: 0         4508\n",
      "ts_code       836433.BJ\n",
      "symbol           836433\n",
      "name           大唐药业(测试)\n",
      "industry            NaN\n",
      "Name: 4508, dtype: object\n",
      "创建 概念 实体...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "879it [00:27, 31.75it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "创建 股东 实体...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "3013it [01:24, 35.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "创建 股票-概念 关系...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "18776it [12:20, 25.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "创建 股票-股东 关系...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "3440it [01:59, 28.71it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "实体 关系 导入成功...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "#执行step2构建图数据库\n",
    "%run step2_store_to_neo4j.py"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81097bd3",
   "metadata": {},
   "source": [
    "## 任务一核心语句调试\n",
    "* 参考文献：\n",
    "* [NLP自学笔记II——fastText文本分类](https://zhuanlan.zhihu.com/p/158882048)\n",
    "* [自然语言处理（NLP）：07 fastText训练中文模型-文本分类](https://blog.csdn.net/shenfuli/article/details/98882655)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4c5bffe6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import fasttext\n",
    "import jieba\n",
    "from config import classifier_corpus_path, classifier_save_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "80ec5057",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['__label__greet', '__label__bot', '__label__goodbye']\n",
      "['</s>', '你', '是', '？', '吗', '啊', '谁', '了', '什么', '机器人', 'good', '再见', '呢', '你好', '名字', '叫', '的', '下次', '人', 'bye', 'morning', '拜拜', 'what', '好', 'hey', '咯', '忙', '呀', '去', '打扰', '哪个', '有', '称呼', '怎么', '问题', '有个', 'hello', '请教', '服务', '人工', '回见', '到底', '事情', '您好', 'hai', 'afternoon', '晚上', '啦', '，', '还是', '聊', 'nishishui', '你是谁', '人类', 'nihao', '下午', '?', 'name', '~', '报上名来', '哈', '喽', 'your', '嘿', '嘿嘿嘿', 'men', '先', '这样', '吧', 'goodbye', '嗨', 'evening', '现在', '营业', '不聊', '在', 'hi', '我', '哪位', 'up', '见', '有人', 'night', '一下', '可以', '问', '早上好', 'who', 'are', 'you', '时间', '来者', '何人', '很', '高心', '见到']\n",
      "75 0.5 1.0\n"
     ]
    }
   ],
   "source": [
    "#训练\n",
    "model=fasttext.train_supervised('./data/classifier/chat.train',lr=0.1,dim=300,epoch=30)\n",
    "model.save_model('./data/classifier/model.bin')\n",
    "#检索标签列表\n",
    "print(model.labels)\n",
    "#检索词列表\n",
    "print(model.words)\n",
    "#计算精确率(Precision)和召回率(Recall)来评估模型\n",
    "#3个标签设置k=2\n",
    "print(*model.test('./data/classifier/chat.train',k=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "6e3151de",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2021-10-19 22:17:58,213 - jieba - DEBUG - Building prefix dict from the default dictionary ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Dumping model to file cache /var/folders/xp/l5cz0dnx3cs9dvqlvqvt4k540000gn/T/jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2021-10-19 22:17:59,464 - jieba - DEBUG - Dumping model to file cache /var/folders/xp/l5cz0dnx3cs9dvqlvqvt4k540000gn/T/jieba.cache\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 1.350 seconds.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2021-10-19 22:17:59,568 - jieba - DEBUG - Loading model cost 1.350 seconds.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2021-10-19 22:17:59,570 - jieba - DEBUG - Prefix dict has been built successfully.\n",
      "(('__label__greet',), array([0.9331727]))\n",
      "greet\n",
      "0.9331727027893066\n"
     ]
    }
   ],
   "source": [
    "#预测\n",
    "query_intent =model.predict(\" \".join([word for word in jieba.lcut('今天天气好吗')]))\n",
    "print(query_intent)\n",
    "print(query_intent[0][0].replace('__label__', '')) \n",
    "print(query_intent[1][0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da7ee8bd",
   "metadata": {},
   "source": [
    "## 任务二核心语句调试\n",
    "* 参考文献：\n",
    "* [ahocorasick从安装到使用](https://www.cnblogs.com/smartisn/p/14033551.html)\n",
    "* [【正则表达式】pyahocorasick介绍](https://blog.csdn.net/pirage/article/details/51657178)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "44d79673",
   "metadata": {},
   "outputs": [],
   "source": [
    "from config import entity_corpus_path, entity_searcher_save_path, contexts\n",
    "import ahocorasick\n",
    "import pandas as pd\n",
    "import os\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "021d819c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>ts_code</th>\n",
       "      <th>symbol</th>\n",
       "      <th>name</th>\n",
       "      <th>industry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>000001.SZ</td>\n",
       "      <td>1</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>银行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>000002.SZ</td>\n",
       "      <td>2</td>\n",
       "      <td>万科A</td>\n",
       "      <td>全国地产</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>000004.SZ</td>\n",
       "      <td>4</td>\n",
       "      <td>国华网安</td>\n",
       "      <td>软件服务</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>000005.SZ</td>\n",
       "      <td>5</td>\n",
       "      <td>ST星源</td>\n",
       "      <td>环境保护</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>000006.SZ</td>\n",
       "      <td>6</td>\n",
       "      <td>深振业A</td>\n",
       "      <td>区域地产</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0    ts_code  symbol  name industry\n",
       "0           0  000001.SZ       1  平安银行       银行\n",
       "1           1  000002.SZ       2   万科A     全国地产\n",
       "2           2  000004.SZ       4  国华网安     软件服务\n",
       "3           3  000005.SZ       5  ST星源     环境保护\n",
       "4           4  000006.SZ       6  深振业A     区域地产"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_folder_path='./data/knowledge/'\n",
    "\n",
    "tree = ahocorasick.Automaton()\n",
    "\n",
    "stock_basic = pd.read_csv(os.path.join(input_folder_path, '股票信息.csv'), encoding='gbk')\n",
    "\n",
    "stock_basic.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "26b73cc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 遍历 stock_basic，添加 name 即股票名字\n",
    "# 股票名字为key，value表示为具体的实体类型，比如：tree.add_word('股票名A', ('股票名A', '股票'))\n",
    "for i in range(len(stock_basic)):\n",
    "    tree.add_word(stock_basic['name'][i], (stock_basic['name'][i],'股票'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "409aaecc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>code</th>\n",
       "      <th>name</th>\n",
       "      <th>src</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TS0</td>\n",
       "      <td>鼠疫</td>\n",
       "      <td>ts</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TS1</td>\n",
       "      <td>黑龙江自贸区</td>\n",
       "      <td>ts</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TS2</td>\n",
       "      <td>黑洞概念</td>\n",
       "      <td>ts</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TS3</td>\n",
       "      <td>黄金水道</td>\n",
       "      <td>ts</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TS4</td>\n",
       "      <td>黄金概念</td>\n",
       "      <td>ts</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  code    name src\n",
       "0  TS0      鼠疫  ts\n",
       "1  TS1  黑龙江自贸区  ts\n",
       "2  TS2    黑洞概念  ts\n",
       "3  TS3    黄金水道  ts\n",
       "4  TS4    黄金概念  ts"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concept = pd.read_csv(os.path.join(input_folder_path, '概念信息.csv'), encoding='gbk')\n",
    "concept.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "b314833f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 遍历 concept，添加 name 即概念名字\n",
    "# 概念名字为key，value表示为具体的实体类型，比如：tree.add_word('概念名A', ('概念名A', '概念'))\n",
    "for i in range(len(concept)):\n",
    "    tree.add_word(concept['name'][i], (concept['name'][i],'概念'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "701e9952",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>股东名称</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>陈建建</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>台州市杰克投资有限公司</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>王进权</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>孔令国</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>集晶(香港)有限公司</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          股东名称\n",
       "0          陈建建\n",
       "1  台州市杰克投资有限公司\n",
       "2          王进权\n",
       "3          孔令国\n",
       "4   集晶(香港)有限公司"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "holder = pd.read_csv(os.path.join(input_folder_path, '股东信息.csv'), encoding='gbk')\n",
    "holder.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "82b70712",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 遍历 holder，添加 股东名称\n",
    "# 股东名称为key，value表示为具体的实体类型，比如：tree.add_word('股东名称A', ('股东名称A', '股东'))\n",
    "for i in range(len(holder)):\n",
    "    tree.add_word(holder['股东名称'][i], (holder['股东名称'][i],'股东'))\n",
    "tree.make_automaton()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "08b9c9a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "(1, ('鼠疫', '概念'))\n",
      "(4, ('王进权', '股东'))\n"
     ]
    }
   ],
   "source": [
    "#测试tree是否可用\n",
    "print('王进权' in tree)\n",
    "for item in tree.iter(\"鼠疫王进权\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "91d4b998",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "保存成功\n"
     ]
    }
   ],
   "source": [
    "#保存\n",
    "tree_save_path='./checkpoints/entity_searcher/search_tree.pkl'\n",
    "with open(tree_save_path, 'wb') as fout:\n",
    "    pickle.dump(tree, fout)\n",
    "print('保存成功')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4421d264",
   "metadata": {},
   "source": [
    "## 任务三核心语句调试\n",
    "* 参考文献：\n",
    "* [The Py2neo Handbook](https://py2neo.org/2021.1/index.html)\n",
    "* [py2neo基本用法](https://zhuanlan.zhihu.com/p/81175725)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "e5a510d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "frozenset({'概念', '股票', '股东'})\n",
      "frozenset({'所属概念', '持有'})\n"
     ]
    }
   ],
   "source": [
    "from py2neo import Graph\n",
    "graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))\n",
    "print(graph.schema.node_labels)\n",
    "print(graph.schema.relationship_types)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "365fc529",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "平安银行\n",
      "股票\n",
      "平安银行所属概念是风电\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#概念信息\n",
    "entities = {'平安银行':'股票'}\n",
    "response = \"\"\n",
    "for entity_name, entity_type in entities.items():\n",
    "    print(entity_name)\n",
    "    print(entity_type)\n",
    "    if entity_type == '股票':\n",
    "        cypher_sql = f'MATCH (s:{entity_type})-[r:所属概念]->(c:概念) where s.股票名称 = \"{entity_name}\" return c.概念名称'\n",
    "        rtn = graph.run(cypher_sql).data()\n",
    "        response += f'{entity_name}所属概念是{rtn[0][\"c.概念名称\"]}' + '\\n'\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "ac93680f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "王进权\n",
      "股东\n",
      "王进权持有奥康国际,占比4.98%\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#股东信息\n",
    "entities = {'王进权':'股东'}\n",
    "response = \"\"\n",
    "for entity_name, entity_type in entities.items():\n",
    "    print(entity_name)\n",
    "    print(entity_type)\n",
    "    if entity_type == '股东':\n",
    "        cypher_sql = f'MATCH (s:{entity_type})-[r:持有]->(c:股票) where s.股东名称 = \"{entity_name}\" return c.股票名称, r.hold_ratio'\n",
    "        rtn = graph.run(cypher_sql).data()#在graph_matcher.py要加上self\n",
    "        response += f'{entity_name}持有{rtn[0][\"c.股票名称\"]},占比{rtn[0][\"r.hold_ratio\"]}' +'%'+ '\\n'\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "e695eeeb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "平安银行\n",
      "股票\n",
      "平安银行的行业是银行\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#行业信息\n",
    "entities = {'平安银行':'股票'}\n",
    "response = \"\"\n",
    "for entity_name, entity_type in entities.items():\n",
    "    print(entity_name)\n",
    "    print(entity_type)\n",
    "    if entity_type == '股票':\n",
    "        cypher_sql = f'MATCH (s:{entity_type}) where s.股票名称 = \"{entity_name}\" return s.行业'\n",
    "        rtn = graph.run(cypher_sql).data()#在graph_matcher.py要加上self\n",
    "        response += f'{entity_name}的行业是{rtn[0][\"s.行业\"]}' + '\\n'\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a571f11",
   "metadata": {},
   "source": [
    "## 主语句调试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58bb8c89",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用户: 宁德时代的行业是\n",
      "机器人: 宁德时代的行业是电气设备\n",
      "用户: 平安银行的概念是\n",
      "机器人: 平安银行所属概念是风电\n",
      "用户: 王进权持有的股票是\n",
      "机器人: 王进权持有奥康国际,占比4.98%\n",
      "用户: 你很聪明\n",
      "机器人: 祝你吃嘛嘛香，身体倍棒\n"
     ]
    }
   ],
   "source": [
    "%run main.py"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
